# Install ThemePark from GitHub only when it is not already available, so that
# re-running or re-knitting this document does not reinstall it every time.
# Requires the remotes package: install.packages("remotes")
if (!requireNamespace("ThemePark", quietly = TRUE)) {
  remotes::install_github("MatthewBJane/ThemePark")
}
library(ThemePark)
# Preview the first rows of the table of themes and their creators.
head(themepark_themes)
## theme creator
## 1 barbie Matthew B. Jané
## 2 oppenheimer Matthew B. Jané & Toki Liam
## 3 starwars Matthew B. Jané
## 4 zelda Alex Slavenko
## 5 terminator Alex Slavenko
## 6 spiderman Velu P.K. Immonen
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggridges)
# Global figure defaults applied to every chunk in this document.
knitr::opts_chunk$set(
  out.width = "90%",
  fig.asp = 0.6,
  fig.width = 6
)
# Pull PRCP/TMIN/TMAX for three monitors over 2021-2022, attach a readable
# station name for each id, divide tmin and tmax by 10, and move name and id
# to the front of the data frame.
weather_df <-
  rnoaa::meteo_pull_monitors(
    c("USW00094728", "USW00022534", "USS0023B17S"),
    date_min = "2021-01-01",
    date_max = "2022-12-31",
    var = c("PRCP", "TMIN", "TMAX")
  ) |>
  mutate(
    tmin = tmin / 10,
    tmax = tmax / 10,
    name = recode(
      id,
      USW00094728 = "CentralPark_NY",
      USW00022534 = "Molokai_HI",
      USS0023B17S = "Waterhole_WA"
    )
  ) |>
  select(name, id, everything())
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00094728.dly
## date created (size, mb): 2023-09-28 10:19:14.527811 (8.524)
## file min/max dates: 1869-01-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00022534.dly
## date created (size, mb): 2023-09-28 10:19:24.312619 (3.83)
## file min/max dates: 1949-10-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USS0023B17S.dly
## date created (size, mb): 2023-09-28 10:19:23.539384 (0.994)
## file min/max dates: 1999-09-01 / 2023-09-30
# Scatterplot of tmax against tmin, with the data frame passed as an argument.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_point()
## Warning: Removed 17 rows containing missing values (`geom_point()`).
You can also take/start with the df and pipe the scatterplot, and it will create exactly the same plot:
# Same scatterplot, this time with the data frame piped into ggplot().
weather_df |>
  ggplot(mapping = aes(y = tmax, x = tmin)) +
  geom_point()
## Warning: Removed 17 rows containing missing values (`geom_point()`).
Piping may make filtering easier. You don’t have to create a separate df just for New York, for example:
# Filter to Central Park inside the pipeline and store the resulting plot.
# The station label is "CentralPark_NY" (with an underscore); filtering on
# "CentralParkNY" matches no rows and produces an empty plot.
nyc_weather <-
  weather_df |>
  filter(name == "CentralPark_NY") |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point()

# The stored plot already has a point layer, so print it as-is instead of
# adding a second, identical geom_point().
nyc_weather
color can be added to the entire plot in ggplot or to the points in geom_point
geom_smooth adds a smooth curve
se = FALSE removes standard error bars from the curve
alpha blending/shading makes the points more transparent (alpha = 0.3 means each point is 30% opaque, 70% transparent)
# Color is mapped only inside the point layer, so geom_smooth() does not
# inherit it. se = FALSE turns off the standard-error band.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_point(mapping = aes(color = name), alpha = 0.3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).
Plot with facets:
# Color is mapped in ggplot(), so both layers inherit it; one panel per name.
ggplot(
  data = weather_df,
  mapping = aes(x = tmin, y = tmax, color = name)
) +
  geom_point(alpha = 0.3) +
  geom_smooth() +
  facet_grid(cols = vars(name))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).
Another fancy plot:
size = precipitation (higher precipitation = larger size)
# Seasonal trend of tmax per station, with point size mapped to precipitation.
# A constant `size = 0.5` given alongside aes(size = prcp) overrides the
# mapping and makes every point the same size, so only the mapping is kept.
ggplot(weather_df, aes(x = date, y = tmax, color = name)) +
  geom_point(aes(size = prcp), alpha = 0.3) +
  geom_smooth() +
  facet_grid(. ~ name)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).
Assigning specific colors to specific things:
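A fixed color must be set in geom_point(), outside aes(), rather than as an aesthetic in ggplot(): aesthetic mappings take variables from your dataset and map them onto colors, so aes(color = "blue") would be treated as a variable rather than as the color blue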
# Central Park only; color is a constant set outside aes(), not a mapping.
weather_df |>
  filter(name == "CentralPark_NY") |>
  ggplot(mapping = aes(x = date, y = tmax)) +
  geom_point(color = "blue")
Hex plot:
# Hexagonal binning of tmax against tmin.
weather_df |>
  ggplot(mapping = aes(y = tmax, x = tmin)) +
  geom_hex()
## Warning: Removed 17 rows containing non-finite values (`stat_binhex()`).
Line plot:
# Line plot of tmax over time for the Molokai station.
weather_df |>
  filter(name == "Molokai_HI") |>
  ggplot(mapping = aes(x = date, y = tmax)) +
  geom_line()
Can combine line plot with points:
# Faint line with small points layered on top, Molokai station only.
weather_df |>
  filter(name == "Molokai_HI") |>
  ggplot(mapping = aes(x = date, y = tmax)) +
  geom_line(alpha = 0.3) +
  geom_point(size = 0.3)
## Warning: Removed 1 rows containing missing values (`geom_point()`).
Easiest starting point for this is a basic histogram:
# Histogram of tmax with default binning.
ggplot(data = weather_df, mapping = aes(x = tmax)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).
fill argument fills in colors for histogram
position = “dodge” places bars for each group side-by-side - bars should avoid each other rather than stack up on each other (however, this can get difficult –> easier to use density plots)
# One set of bars per station, placed side by side within each 2-unit bin.
ggplot(data = weather_df, mapping = aes(x = tmax, fill = name)) +
  geom_histogram(binwidth = 2, position = "dodge")
## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).
Density plot:
adjust argument is similar to binwidth in histograms
if you over-smooth or under-smooth, you may miss the message you’re trying to make
density plots show more distribution than box plots
# Overlaid, semi-transparent density curves of tmax, one per station.
ggplot(data = weather_df, mapping = aes(x = tmax, fill = name)) +
  geom_density(adjust = 2, alpha = 0.3)
## Warning: Removed 17 rows containing non-finite values (`stat_density()`).
Basic boxplots:
# Single boxplot of tmax across all stations.
ggplot(data = weather_df, mapping = aes(y = tmax)) +
  geom_boxplot()
## Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).
Can compare by group, e.g., name:
# One boxplot of tmax per station.
ggplot(data = weather_df, mapping = aes(x = name, y = tmax)) +
  geom_boxplot()
## Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).
Violin plot:
# One violin of tmax per station.
ggplot(data = weather_df, mapping = aes(x = name, y = tmax)) +
  geom_violin()
## Warning: Removed 17 rows containing non-finite values (`stat_ydensity()`).
Ridge plot:
# Ridge plot: one tmax density per station, stacked along the y axis.
ggplot(data = weather_df, mapping = aes(x = tmax, y = name)) +
  geom_density_ridges(scale = 0.9)
## Picking joint bandwidth of 1.54
## Warning: Removed 17 rows containing non-finite values
## (`stat_density_ridges()`).
# Build the scatterplot once, write it to a PDF, then display it.
ggp_weather <-
  ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_point(mapping = aes(color = name), alpha = 0.5)
ggsave(filename = "ggp_weather.pdf", plot = ggp_weather, width = 8, height = 5)
## Warning: Removed 17 rows containing missing values (`geom_point()`).
ggp_weather
## Warning: Removed 17 rows containing missing values (`geom_point()`).
You can also set options globally in the beginning after loading packages with this code:
knitr::opts_chunk$set( fig.width = 6, fig.asp = .6, out.width = "90%" )
fig.asp = aspect ratio
labs(x or y) renames axis on the graph labs(color) renames the legend in this case since the colors are the legend labs(title) adds a header at the top labs(caption) adds a caption at the bottom
scale_x_continuous(breaks = ) sets where the tick marks go on the x axis, and (labels = ) sets the text shown at those ticks; scale_y_continuous works the same way for the y axis: (position = "right") moves the y axis to the right side of the graph, (trans = "sqrt") transforms the scale, and (limits = c(0, 30)) restricts the axis to 0-30 instead of the default range of about -15 to 40 (points outside the limits are dropped, which is why the warning below reports more removed rows)
# Scatterplot with custom axis/legend labels, title and caption, hand-picked
# x-axis ticks, and a y axis moved to the right and limited to 0-30.
weather_df |>
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Min daily temp (Degrees C)",
    y = "Max daily temp",
    color = "Location",
    title = "Temperature plot",
    # Caption text corrected: it previously read "retreieved from moaa".
    caption = "The data was retrieved from NOAA"
  ) +
  scale_x_continuous(
    breaks = c(-15, 0, 15),
    labels = c("-15 C", "0 C", "15 C")
  ) +
  scale_y_continuous(
    position = "right",
    limits = c(0, 30)
  )
## Warning: Removed 302 rows containing missing values (`geom_point()`).
there are a lot of different scales you can adjust under scale_color
scale_color_hue works well for categorical variables e.g., + scale_color_hue(h = c(100, 300))
the viridis package is good for colors; option = "magma" selects magma, one of the color scales it offers besides the default viridis; discrete = TRUE relates to the variable type (categorical vs. continuous)
# Labelled scatterplot using the discrete viridis color scale for station name.
weather_df |>
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Min daily temp (Degrees C)",
    y = "Max daily temp",
    color = "Location",
    title = "Temperature plot",
    # Caption text corrected: it previously read "retreieved from moaa".
    caption = "The data was retrieved from NOAA"
  ) +
  viridis::scale_color_viridis(discrete = TRUE)
## Warning: Removed 17 rows containing missing values (`geom_point()`).
other chart details:
theme_bw() replaces the default grey background with a white background; be careful about the order in which you add it - a complete theme acts as a reset, so put it before any theme() adjustments
theme_classic() is like theme_bw() but hides the gridlines and the border around the plot
theme_minimal() is my personal favorite
# Labelled viridis scatterplot with a minimal theme. theme_minimal() is added
# before theme() so the legend-position tweak is not reset by the full theme.
weather_df |>
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Min daily temp (Degrees C)",
    y = "Max daily temp",
    color = "Location",
    title = "Temperature plot",
    # Caption text corrected: it previously read "retreieved from moaa".
    caption = "The data was retrieved from NOAA"
  ) +
  viridis::scale_color_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values (`geom_point()`).
# Color is mapped only in the point layer, so geom_smooth() does not inherit it.
weather_df |>
  ggplot(mapping = aes(x = date, y = tmax)) +
  geom_point(mapping = aes(color = name)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).
# One data frame per station so each layer can draw from its own data.
nyc_weather_df <- filter(weather_df, name == "CentralPark_NY")
hawaii_weather_df <- filter(weather_df, name == "Molokai_HI")

# Points use the Central Park data; the line layer overrides `data` with the
# Molokai data while inheriting the same x/y mapping.
ggplot(data = nyc_weather_df, mapping = aes(x = date, y = tmax)) +
  geom_point() +
  geom_line(data = hawaii_weather_df) +
  ThemePark::theme_barbie() +
  labs(title = "I made this barbie-themed plot for you")
faceting - use it when I want the same plot duplicated across the levels of a related variable; it is not good if you want two completely different plots
# tmax over time, one panel per station, with the barbie theme.
weather_df |>
  ggplot(mapping = aes(x = date, y = tmax, color = name)) +
  geom_point() +
  facet_grid(cols = vars(name)) +
  ThemePark::theme_barbie() +
  labs(title = "this one is slightly cuter")
## Warning: Removed 17 rows containing missing values (`geom_point()`).
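to put two different plots side by side, first save each plot as its own object (they can then be combined, for example with the patchwork package: ggp_temp_scatter + ggp_precip_density):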
# Scatterplot of tmax against tmin, colored by station.
ggp_temp_scatter <-
  weather_df |>
  ggplot(mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5)

# Density of precipitation, one curve per station.
ggp_precip_density <-
  weather_df |>
  ggplot(mapping = aes(x = prcp, color = name)) +
  geom_density()
factor variables are thought of as categorical variables with order/levels, whereas characters are just characters with no structure. This is important now because when ggplot has to figure out what goes onto which axis and the order of colors, it converts the variable to a factor in the background and by default uses alphabetical order, so you need to change the variable structure to change the order
the fct_reorder function lets you put name in the order of some other variable; fct_relevel sets the order of the levels manually
# Manually order the stations for the boxplot. The levels must match the
# labels in `name` exactly: "Waterhole_MA" is not a level (the station is
# "Waterhole_WA"), which triggered an "unknown level" warning.
weather_df |>
  mutate(
    name = fct_relevel(name, c("Molokai_HI", "CentralPark_NY", "Waterhole_WA"))
  ) |>
  ggplot(aes(x = name, y = tmax)) +
  geom_boxplot()
## Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).
# Order the stations by tmax for the violin plot. tmax contains missing
# values; .na_rm = TRUE drops them explicitly when computing the ordering
# instead of relying on the default, which removes them with a warning.
weather_df |>
  mutate(
    name = fct_reorder(name, tmax, .na_rm = TRUE)
  ) |>
  ggplot(aes(x = name, y = tmax, fill = name)) +
  geom_violin()
## Warning: Removed 17 rows containing non-finite values (`stat_ydensity()`).
# Read the litters data, clean the column names, and split `group` after its
# third character into dose and day_of_treatment.
litters_df <-
  read_csv("FAS_litters.csv") |>
  janitor::clean_names() |>
  separate(col = group, into = c("dose", "day_of_treatment"), sep = 3)
## Rows: 49 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Group, Litter Number
## dbl (6): GD0 weight, GD18 weight, GD of Birth, Pups born alive, Pups dead @ ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the pups data and clean the column names.
pups_df <-
  read_csv("FAS_pups.csv") |>
  janitor::clean_names()
## Rows: 313 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Litter Number
## dbl (5): Sex, PD ears, PD eyes, PD pivot, PD walk
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the litter-level columns to every pup record.
FAS_df <-
  left_join(x = pups_df, y = litters_df, by = "litter_number")

# Stack the pd_* columns into outcome / postnatal_day pairs, drop incomplete
# rows, reorder the outcome levels by postnatal_day, and draw violins by dose
# in a grid of day_of_treatment (rows) by outcome (columns).
FAS_df |>
  select(dose, day_of_treatment, starts_with("pd")) |>
  pivot_longer(
    cols = pd_ears:pd_walk,
    values_to = "postnatal_day",
    names_to = "outcome"
  ) |>
  drop_na() |>
  mutate(outcome = fct_reorder(outcome, postnatal_day)) |>
  ggplot(mapping = aes(x = dose, y = postnatal_day)) +
  geom_violin() +
  facet_grid(rows = vars(day_of_treatment), cols = vars(outcome))
lubridate: handy way of accessing month as a variable
# Re-import the weather data as before, additionally storing each date rounded
# down to the first day of its month in `month`.
weather_df <-
  rnoaa::meteo_pull_monitors(
    c("USW00094728", "USW00022534", "USS0023B17S"),
    date_min = "2021-01-01",
    date_max = "2022-12-31",
    var = c("PRCP", "TMIN", "TMAX")
  ) |>
  mutate(
    tmin = tmin / 10,
    tmax = tmax / 10,
    name = recode(
      id,
      USW00094728 = "CentralPark_NY",
      USW00022534 = "Molokai_HI",
      USS0023B17S = "Waterhole_WA"
    ),
    month = lubridate::floor_date(date, unit = "month")
  ) |>
  select(name, id, everything())
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00094728.dly
## date created (size, mb): 2023-09-28 10:19:14.527811 (8.524)
## file min/max dates: 1869-01-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00022534.dly
## date created (size, mb): 2023-09-28 10:19:24.312619 (3.83)
## file min/max dates: 1949-10-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USS0023B17S.dly
## date created (size, mb): 2023-09-28 10:19:23.539384 (0.994)
## file min/max dates: 1999-09-01 / 2023-09-30
# Histogram of precipitation with default binning.
weather_df |>
  ggplot(mapping = aes(x = prcp)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (`stat_bin()`).
here are the big outliers:
# Rows whose precipitation value exceeds 1000.
weather_df |>
  filter(1000 < prcp)
## # A tibble: 3 × 7
## name id date prcp tmax tmin month
## <chr> <chr> <date> <dbl> <dbl> <dbl> <date>
## 1 CentralPark_NY USW00094728 2021-08-21 1130 27.8 22.8 2021-08-01
## 2 CentralPark_NY USW00094728 2021-09-01 1811 25.6 17.2 2021-09-01
## 3 Molokai_HI USW00022534 2022-12-18 1120 23.3 18.9 2022-12-01
# Keep days with tmax between 20 and 30 (inclusive), then plot tmax vs. tmin.
weather_df |>
  filter(tmax >= 20 & tmax <= 30) |>
  ggplot(mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_point()
if you run group_by() on its own, you'll notice that grouping is sort of invisible; the only hint is the `# Groups: name` comment at the top of the printed tibble
summarize(n_obs = n()) gives the number of observations in each group - a good way to determine the sample size in the groups you care about
# Number of rows for each name/month combination.
weather_df |>
  group_by(name, month) |>
  summarise(n_obs = n())
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
## # A tibble: 72 × 3
## # Groups: name [3]
## name month n_obs
## <chr> <date> <int>
## 1 CentralPark_NY 2021-01-01 31
## 2 CentralPark_NY 2021-02-01 28
## 3 CentralPark_NY 2021-03-01 31
## 4 CentralPark_NY 2021-04-01 30
## 5 CentralPark_NY 2021-05-01 31
## 6 CentralPark_NY 2021-06-01 30
## 7 CentralPark_NY 2021-07-01 31
## 8 CentralPark_NY 2021-08-01 31
## 9 CentralPark_NY 2021-09-01 30
## 10 CentralPark_NY 2021-10-01 31
## # ℹ 62 more rows
can also count the number of observations per name with count() (and rename the count column)
# count() does the group-and-tally in one step; the `name =` argument sets the
# name of the count column.
weather_df |>
  count(
    name,
    name = "n_obs"
  )
## # A tibble: 3 × 2
## name n_obs
## <chr> <int>
## 1 CentralPark_NY 730
## 2 Molokai_HI 730
## 3 Waterhole_WA 730
# Monthly row counts, spread into one column per station.
weather_df |>
  count(name, month) |>
  pivot_wider(values_from = n, names_from = name)
## # A tibble: 24 × 4
## month CentralPark_NY Molokai_HI Waterhole_WA
## <date> <int> <int> <int>
## 1 2021-01-01 31 31 31
## 2 2021-02-01 28 28 28
## 3 2021-03-01 31 31 31
## 4 2021-04-01 30 30 30
## 5 2021-05-01 31 31 31
## 6 2021-06-01 30 30 30
## 7 2021-07-01 31 31 31
## 8 2021-08-01 31 31 31
## 9 2021-09-01 30 30 30
## 10 2021-10-01 31 31 31
## # ℹ 14 more rows
can take us beyond simply counting
2 ways to remove NA:
# Way 1: drop rows with missing tmax up front, then summarise per station.
weather_df |>
  drop_na(tmax) |>
  group_by(name) |>
  summarise(
    mean_tmax = mean(tmax),
    median_tmax = median(tmax),
    sd_tmax = sd(tmax)
  )
## # A tibble: 3 × 4
## name mean_tmax median_tmax sd_tmax
## <chr> <dbl> <dbl> <dbl>
## 1 CentralPark_NY 17.7 18.9 9.96
## 2 Molokai_HI 28.3 28.3 1.80
## 3 Waterhole_WA 7.38 6.1 7.55
# Way 2: keep every row and let each summary function skip missing values.
weather_df |>
  group_by(name) |>
  summarise(
    mean_tmax = mean(tmax, na.rm = TRUE),
    median_tmax = median(tmax, na.rm = TRUE),
    sd_tmax = sd(tmax, na.rm = TRUE)
  )
## # A tibble: 3 × 4
## name mean_tmax median_tmax sd_tmax
## <chr> <dbl> <dbl> <dbl>
## 1 CentralPark_NY 17.7 18.9 9.96
## 2 Molokai_HI 28.3 28.3 1.80
## 3 Waterhole_WA 7.38 6.1 7.55
# Mean tmax per station and month, drawn as points joined by lines.
weather_df |>
  group_by(name, month) |>
  summarise(mean_tmax = mean(tmax, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = month, y = mean_tmax, color = name)) +
  geom_point() +
  geom_line()
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
more pivot wider:
# Mean tmax per station and month, with one column per station.
weather_df |>
  group_by(name, month) |>
  summarise(mean_tmax = mean(tmax, na.rm = TRUE)) |>
  pivot_wider(values_from = mean_tmax, names_from = name)
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
## # A tibble: 24 × 4
## month CentralPark_NY Molokai_HI Waterhole_WA
## <date> <dbl> <dbl> <dbl>
## 1 2021-01-01 4.27 27.6 0.8
## 2 2021-02-01 3.87 26.4 -0.786
## 3 2021-03-01 12.3 25.9 2.62
## 4 2021-04-01 17.6 26.6 6.10
## 5 2021-05-01 22.1 28.6 8.20
## 6 2021-06-01 28.1 29.6 15.3
## 7 2021-07-01 28.4 30.0 17.3
## 8 2021-08-01 28.8 29.5 17.2
## 9 2021-09-01 24.8 29.7 12.6
## 10 2021-10-01 19.9 29.1 5.48
## # ℹ 14 more rows
sometimes it’s nice to format things as actual tables, especially if you’re sending them to others: use the knitr package’s kable() function
# Same wide table of monthly mean tmax, rendered with kable() at two decimals.
weather_df |>
  group_by(name, month) |>
  summarise(mean_tmax = mean(tmax, na.rm = TRUE)) |>
  pivot_wider(values_from = mean_tmax, names_from = name) |>
  knitr::kable(digits = 2)
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
| month | CentralPark_NY | Molokai_HI | Waterhole_WA |
|---|---|---|---|
| 2021-01-01 | 4.27 | 27.62 | 0.80 |
| 2021-02-01 | 3.87 | 26.37 | -0.79 |
| 2021-03-01 | 12.29 | 25.86 | 2.62 |
| 2021-04-01 | 17.61 | 26.57 | 6.10 |
| 2021-05-01 | 22.08 | 28.58 | 8.20 |
| 2021-06-01 | 28.06 | 29.59 | 15.25 |
| 2021-07-01 | 28.35 | 29.99 | 17.34 |
| 2021-08-01 | 28.81 | 29.52 | 17.15 |
| 2021-09-01 | 24.79 | 29.67 | 12.65 |
| 2021-10-01 | 19.93 | 29.13 | 5.48 |
| 2021-11-01 | 11.54 | 28.85 | 3.53 |
| 2021-12-01 | 9.59 | 26.19 | -2.10 |
| 2022-01-01 | 2.85 | 26.61 | 3.61 |
| 2022-02-01 | 7.65 | 26.83 | 2.99 |
| 2022-03-01 | 11.99 | 27.73 | 3.42 |
| 2022-04-01 | 15.81 | 27.72 | 2.46 |
| 2022-05-01 | 22.25 | 28.28 | 5.81 |
| 2022-06-01 | 26.09 | 29.16 | 11.13 |
| 2022-07-01 | 30.72 | 29.53 | 15.86 |
| 2022-08-01 | 30.50 | 30.70 | 18.83 |
| 2022-09-01 | 24.92 | 30.41 | 15.21 |
| 2022-10-01 | 17.43 | 29.22 | 11.88 |
| 2022-11-01 | 14.02 | 27.96 | 2.14 |
| 2022-12-01 | 6.76 | 27.35 | -0.46 |
with a grouped mutate you can compute a group-level mean (here, one per station) instead of one mean for the entire dataset, for example
# Grouped mutate: mean_tmax is computed within each station, and centered_tmax
# is each day's tmax minus that station's mean.
weather_df |>
  group_by(name) |>
  mutate(mean_tmax = mean(tmax, na.rm = TRUE), centered_tmax = tmax - mean_tmax) |>
  ggplot(mapping = aes(x = date, y = centered_tmax, color = name)) +
  geom_point()
## Warning: Removed 17 rows containing missing values (`geom_point()`).
min_rank() gives the ranking from lowest to highest; min_rank(desc()) puts it in descending/reverse order. Here we filter for the coldest days in each month:
# Rank tmax within each station/month and keep the rank-1 rows. Tied days
# share rank 1, so a group can contribute more than one row.
weather_df |>
  group_by(name, month) |>
  mutate(
    tmax_rank = min_rank(tmax)
  ) |>
  filter(tmax_rank < 2)
## # A tibble: 92 × 8
## # Groups: name, month [72]
## name id date prcp tmax tmin month tmax_rank
## <chr> <chr> <date> <dbl> <dbl> <dbl> <date> <int>
## 1 CentralPark_NY USW00094728 2021-01-29 0 -3.8 -9.9 2021-01-01 1
## 2 CentralPark_NY USW00094728 2021-02-08 0 -1.6 -8.2 2021-02-01 1
## 3 CentralPark_NY USW00094728 2021-03-02 0 0.6 -6 2021-03-01 1
## 4 CentralPark_NY USW00094728 2021-04-02 0 3.9 -2.1 2021-04-01 1
## 5 CentralPark_NY USW00094728 2021-05-29 117 10.6 8.3 2021-05-01 1
## 6 CentralPark_NY USW00094728 2021-05-30 226 10.6 8.3 2021-05-01 1
## 7 CentralPark_NY USW00094728 2021-06-11 0 20.6 16.7 2021-06-01 1
## 8 CentralPark_NY USW00094728 2021-06-12 0 20.6 16.7 2021-06-01 1
## 9 CentralPark_NY USW00094728 2021-07-03 86 18.9 15 2021-07-01 1
## 10 CentralPark_NY USW00094728 2021-08-04 0 24.4 19.4 2021-08-01 1
## # ℹ 82 more rows
lag(): for example, how does yesterday’s temperature relate to today’s temperature, i.e. how do previous temperatures relate to current/future temperatures? lag(tmax, 3) would go 3 rows back. If you forget your grouping, the first rows of each station will pick up values from the end of the previous station
# Within each station, store the previous row's tmax alongside the current row.
weather_df |>
  group_by(name) |>
  mutate(yesterday_tmax = dplyr::lag(tmax))
## # A tibble: 2,190 × 8
## # Groups: name [3]
## name id date prcp tmax tmin month yesterday_tmax
## <chr> <chr> <date> <dbl> <dbl> <dbl> <date> <dbl>
## 1 CentralPark_NY USW000… 2021-01-01 157 4.4 0.6 2021-01-01 NA
## 2 CentralPark_NY USW000… 2021-01-02 13 10.6 2.2 2021-01-01 4.4
## 3 CentralPark_NY USW000… 2021-01-03 56 3.3 1.1 2021-01-01 10.6
## 4 CentralPark_NY USW000… 2021-01-04 5 6.1 1.7 2021-01-01 3.3
## 5 CentralPark_NY USW000… 2021-01-05 0 5.6 2.2 2021-01-01 6.1
## 6 CentralPark_NY USW000… 2021-01-06 0 5 1.1 2021-01-01 5.6
## 7 CentralPark_NY USW000… 2021-01-07 0 5 -1 2021-01-01 5
## 8 CentralPark_NY USW000… 2021-01-08 0 2.8 -2.7 2021-01-01 5
## 9 CentralPark_NY USW000… 2021-01-09 0 2.8 -4.3 2021-01-01 2.8
## 10 CentralPark_NY USW000… 2021-01-10 0 5 -1.6 2021-01-01 2.8
## # ℹ 2,180 more rows
show the day-to-day variation in temperature:
# Per station: row-to-row change in tmax, then the standard deviation of it.
weather_df |>
  group_by(name) |>
  mutate(
    temp_change = tmax - dplyr::lag(tmax)
  ) |>
  summarise(sd_temp_change = sd(temp_change, na.rm = TRUE))
## # A tibble: 3 × 2
## name sd_temp_change
## <chr> <dbl>
## 1 CentralPark_NY 4.43
## 2 Molokai_HI 1.24
## 3 Waterhole_WA 3.04